full_data <- readRDS('data/full_data.rds')

Monthly Production Data

prod_by_month <- full_data %>% 
    select(yearmonth, active_1km, monthly_oil_1km, monthly_gas_1km, 
           monthly_water_1km, monthly_boe_1km, active_2p5km, monthly_oil_2p5km, 
           monthly_gas_2p5km, monthly_water_2p5km, monthly_boe_2p5km, active_5km, 
           monthly_oil_5km, monthly_gas_5km, monthly_water_5km, monthly_boe_5km) %>%
  distinct() %>%
  filter(yearmonth <= '2023-03')

coef <- median(1032*prod_by_month$monthly_oil_1km/10^9)/median(prod_by_month$monthly_gas_1km/10^6)

full_data %>%
  ggplot(aes(x = yearmonth, group = 1)) + 
  geom_line(aes(y = monthly_oil_1km/10^6, color = 'Oil'), size=1) + 
  geom_line(aes(y = 1032*monthly_gas_1km/10^9/coef, color = 'Natural Gas'), size=1) +
  scale_y_continuous(
    name = "Monthly Oil in Million Barrels",
    sec.axis = sec_axis(~.*coef, name="Monthly Gas in Billion Cubic Feet")
  ) +
  theme(
    axis.title.y = element_text(color = 'tomato'),
    axis.title.y.right = element_text(color = 'skyblue')
  ) +
  scale_color_manual(values = c("skyblue", "tomato")) +
  scale_x_discrete(breaks = prod_by_month$yearmonth[seq(1, length(prod_by_month$yearmonth), by = 7)]) +
  xlab("Date") + labs(colour="Production Type") +
    theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 63301 rows containing missing values (`geom_line()`).
## Removed 63301 rows containing missing values (`geom_line()`).

Visualizing Daily Max H2S

H2S_dm <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_max = max(H2S, na.rm = TRUE)) %>%
  mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 123 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1724: `day = 2020-08-22`, `Monitor = "Judson"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 122 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_ma <- H2S_dm %>%
  mutate(yearmonth = format(day, "%Y-%m")) %>%
  group_by(yearmonth, Monitor) %>%
  summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
H2S_dm_graph <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor", y = 'Daily Max H2S', x = 'time') +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph) %>% layout(dragmode = 'pan')
H2S_dm_graph_b <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor w/o outlier", y = 'Daily Max H2S', x = 'time') +
   scale_y_continuous(limits = c(0, 100)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
H2S_ma_graph <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph) %>% layout(dragmode = 'pan')
H2S_ma_graph_b <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor w/o outlier", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
  scale_y_continuous(limits=c(0, 50)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph_b) %>% layout(dragmode = 'pan')

The line with the largest deviation/peak corresponds to the 213th&Chico monitor, in 2021-10 Maybe we should start from January 2022 to start modelling to remove the extreme values, or simply removing the 213 monitor since that’s set up specifically for the event.

Visualizing Daily Average H2S

# Compute daily average
H2S_da <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_avg = mean(H2S, na.rm=TRUE))
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_da_graph <- H2S_da %>%
 ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Average H2S concentration by monitor", y = 'Daily Average H2S', x = 'time') +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_da_graph) %>% layout(dragmode = 'pan')
H2S_da_graph_b <- H2S_da %>%
 ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Average H2S concentration by monitor w/o outlier", y = 'Daily Avearge H2S', x = 'time') +
   scale_y_continuous(limits = c(0, 100)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
full_data %>%
  polarPlot(pollutant = "H2S", col = "turbo", 
            key.position = "bottom",
            key.header = "mean H2S", 
            key.footer = NULL, title = 'hi')

# Create a polar map
# TBD: include markers for refineries
polarplot <- polarMap(
  full_data %>% 
    filter(!(is.na(latitude.x) | is.na(H2S) | is.na(wd) | is.na(ws))) %>%
    rename(date = DateTime),
  pollutant = "H2S",
  latitude = "latitude.x",
  longitude = "longitude.x",
  popup = "Monitor",
  provider = "Esri.WorldImagery",
  d.icon = 150,
  d.fig = 2.5,
  alpha = 0.75
)
## 
Creating Polar Markers â– â– â– â– â– â–  15% | ETA: 9s

Creating Polar Markers â– â– â– â– â– â– â– â–  23%
## | ETA: 9s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â–  31% | ETA: 7s

Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â–  38% | ETA: 7s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  46% | ETA:
## 5s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  54% | ETA: 5s

Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  62% | ETA: 4s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 
## 69% | ETA: 3s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  77% | ETA:
## 2s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  85% | ETA: 2s

Creating Polar
## Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  92% | ETA: 1s

polarplot
#saveWidget(polarplot, file="polarplot.html")

Downwind Indicator wrp Refinery

# Create variable to indicate downwind wrt refinery
wind_diff <- abs(full_data$Converted_Angle - 180 - full_data$wd)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
full_data$downwind_ref <- as.integer(wind_diff <= 15)

Find angle between monitor and nearest water treatment plant

# Function to calculate the angle between two lat/long points
calculate_angle <- function(lat1, lon1, lat2, lon2) {

  # Calculate the bearing between the two points
  angle <- bearing(cbind(lon1, lat1), cbind(lon2, lat2))
  
  # Return the angle
  return(angle)
}

wrp <- full_data %>% 
  select(name, wrp_utm_x, wrp_utm_y) %>%
  distinct()

wrp_coord <- cbind(wrp, terra::project(as.matrix(cbind(wrp$wrp_utm_x, wrp$wrp_utm_y)), 
                            "+proj=utm +zone=11 ellps=WGS84", 
                            "+proj=longlat +datum=WGS84")) %>%
  rename(longitude.wrp = '1',
         latitude.wrp = '2')

full_data <- full_data %>%
  left_join(wrp_coord %>% select(name, latitude.wrp, longitude.wrp), join_by(name), keep=F)

mon_wrp <- full_data %>% 
              select(Monitor, name, latitude.x, longitude.x, latitude.wrp, longitude.wrp) %>%
              distinct()

mon_wrp$wrp_angle <- calculate_angle(mon_wrp$latitude.x, mon_wrp$longitude.x, 
                                     mon_wrp$latitude.wrp, mon_wrp$longitude.wrp)

mon_wrp$wrp_angle <- if_else(mon_wrp$wrp_angle < 0, mon_wrp$wrp_angle + 360, mon_wrp$wrp_angle)

full_data <- full_data %>%
  left_join(mon_wrp %>% select(Monitor, name, wrp_angle), join_by(Monitor, name), keep=F)

# Finally, check if wind direction is downwind w.r.t wrp
wind_diff <- abs(full_data$wrp_angle - full_data$wd)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)

full_data$downwind_wrp <- as.integer(wind_diff <= 15)
# Compute daily average wd/ws
daily_full <- full_data %>%
  select(H2S, ws, wd, latitude.x, longitude.x, mon_utm_x, mon_utm_y, Monitor, MinDist, 
         Converted_Angle, Refinery, year, month, day, weekday, 
         monthly_oil_1km, monthly_gas_1km, monthly_oil_2p5km, 
         monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km,
         dist_wrp, capacity, active_1km, active_2p5km, active_5km,
         wrp_angle) %>%
  group_by(Monitor, day) %>%
  mutate(H2S_daily_max = max(H2S, na.rm=TRUE),
            H2S_daily_avg = mean(H2S, na.rm=TRUE),
            ws_avg = mean(ws, na.rm=TRUE),
            wd_avg = as.numeric(mean(circular(wd, units = 'degrees'), na.rm=TRUE))) %>%
  mutate(wd_avg = if_else(wd_avg < 0, wd_avg+360, wd_avg)) %>%
  ungroup() %>%
  rename(monitor_lat = latitude.x, monitor_lon = longitude.x) %>%
  mutate(H2S_daily_max = ifelse(H2S_daily_max == -Inf, NA, H2S_daily_max)) %>%
  select(-c(H2S, wd, ws)) %>%
  distinct()
## Warning: There were 2449 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1637: `Monitor = "First Methodist"`, `day = 2022-11-23`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 2448 remaining warnings.
# Get the downwind indicators for daily data
wind_diff <- abs(daily_full$Converted_Angle - 180 - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
daily_full$daily_downwind_ref <- as.integer(wind_diff <= 15)

wind_diff <- abs(daily_full$wrp_angle - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)

daily_full$daily_downwind_wrp <- as.integer(wind_diff <= 15)

Get monitor locations

monitors_coord <- full_data %>%
  select(Monitor, latitude.x, longitude.x) %>%
  distinct()

monitors <- monitors_coord %>%
  select(Monitor)
  
coordinates(monitors_coord) <- ~ longitude.x + latitude.x

Load in elevation data

elevation <- raster('shapefiles/N33W119.hgt')

monitors <- cbind(monitors, extract(elevation, monitors_coord)) %>%
  as.data.frame() %>%
  rename(elevation = 2)

Load in EVI data

evi <- raster('shapefiles/MOD13Q1.006__250m_16_days_EVI_doy2022177_aid0001.tif')

monitors <- cbind(monitors, extract(evi, monitors_coord) * 0.0001) %>%
  as.data.frame() %>%
  rename(EVI = 3)

Merge EVI and elevation to data

full_data <- full_data %>%
  left_join(monitors, join_by(Monitor))

daily_full <- daily_full %>%
  left_join(monitors, join_by(Monitor))

Merge Odor Complaint data

odor <- read_csv('data/SCAQMD_odorcomplaints_2018_2022.csv')
## New names:
## Rows: 19913 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (3): ...1, zip, num_odor_complaints date (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
la_county <- read_sf('shapefiles/Zip_Codes_(LA_County)/Zip_Codes_(LA_County).shp')
# https://gis.stackexchange.com/questions/282750/identify-polygon-containing-point-with-r-sf-package
pnts_sf <- do.call("st_sfc",c(lapply(1:nrow(monitors_coord@coords), 
function(i) {st_point(as.numeric(monitors_coord@coords[i, ]))}), list("crs" = 4326))) 

pnts_trans <- st_transform(pnts_sf, 2163) # apply transformation to pnts sf
la_county_trans <- st_transform(la_county, 2163)      # apply transformation to polygons sf

# intersect and extract state name
monitors_coord@data$county <- apply(st_intersects(la_county_trans, pnts_trans, sparse = FALSE), 2, 
               function(col) { 
                  la_county_trans[which(col), ]$ZIPCODE
               })
# Plot results to double check
la_county_present <- la_county[la_county$ZIPCODE %in% unique(monitors_coord$county), ]

la_county_present %>% 
  ggplot() +
  geom_sf(aes(fill = ZIPCODE)) +
  geom_point(data = as_tibble(monitors_coord@coords), aes(x = longitude.x, y = latitude.x))

odor <- odor[odor$zip %in% unique(monitors_coord$county), ]

monitor_odor <- monitors_coord@data %>%
  left_join(odor %>% mutate(zip = as.character(zip)), join_by(county == zip)) %>%
  select(-`...1`)
## Warning in left_join(., odor %>% mutate(zip = as.character(zip)), join_by(county == : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 894 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
full_data <- full_data %>%
  left_join(monitors_coord@data, join_by(Monitor))

daily_full <- daily_full %>%
  left_join(monitors_coord@data, join_by(Monitor))

full_data <- full_data %>%
  left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints), join_by(Monitor, day == date))

full_data <- full_data %>%
  mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))

daily_full <- daily_full %>%
  left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints), join_by(Monitor, day == date))

daily_full <- daily_full %>%
  mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))

Explore some GAM models

Five minute H2S

h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.172e+00  3.213e-03  364.72   <2e-16 ***
## year2023       -3.263e-01  2.800e-03 -116.54   <2e-16 ***
## wd_secQ2       -2.560e-01  3.751e-03  -68.23   <2e-16 ***
## wd_secQ3       -2.898e-01  3.892e-03  -74.46   <2e-16 ***
## wd_secQ4       -2.095e-01  3.492e-03  -59.99   <2e-16 ***
## ws             -5.424e-02  4.176e-04 -129.91   <2e-16 ***
## I(1/MinDist^2)  2.114e+05  2.329e+03   90.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.996      8 2496  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0946   Deviance explained = 9.47%
## GCV = 0.92965  Scale est. = 0.92963   n = 772251
plot(h2s_model_a)

h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
                   s(latitude.x, longitude.x, bs='tp', k = 8), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) + 
##     s(latitude.x, longitude.x, bs = "tp", k = 8)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.012e+00  3.171e-03  319.16   <2e-16 ***
## wd_secQ2       -1.954e-01  3.696e-03  -52.87   <2e-16 ***
## wd_secQ3       -1.822e-01  3.905e-03  -46.65   <2e-16 ***
## wd_secQ4       -1.129e-01  3.487e-03  -32.39   <2e-16 ***
## ws             -6.921e-02  4.147e-04 -166.90   <2e-16 ***
## I(1/MinDist^2)  3.002e+05  2.858e+03  105.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df    F p-value    
## s(as.numeric(month))      7.999      8 1708  <2e-16 ***
## s(latitude.x,longitude.x) 6.999      7 6523  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.13   Deviance explained =   13%
## GCV = 0.8932  Scale est. = 0.89317   n = 772251
plot(h2s_model_b)

# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws + 
                     I(1/MinDist^2) + Converted_Angle + 
                     s(latitude.x, longitude.x, bs='tp', k = 10) + 
                     as.factor(weekday), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Converted_Angle + s(latitude.x, longitude.x, 
##     bs = "tp", k = 10) + as.factor(weekday)
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.420e+00  2.374e-02  101.947  < 2e-16 ***
## year2023             -3.327e-01  2.864e-03 -116.135  < 2e-16 ***
## wd_secQ2             -1.858e-01  3.643e-03  -51.011  < 2e-16 ***
## wd_secQ3             -1.897e-01  3.846e-03  -49.325  < 2e-16 ***
## wd_secQ4             -1.090e-01  3.441e-03  -31.663  < 2e-16 ***
## ws                   -6.128e-02  4.076e-04 -150.319  < 2e-16 ***
## I(1/MinDist^2)        1.112e-04  1.961e-06   56.689  < 2e-16 ***
## Converted_Angle      -5.775e-03  1.129e-04  -51.138  < 2e-16 ***
## as.factor(weekday).L  9.867e-02  2.801e-03   35.228  < 2e-16 ***
## as.factor(weekday).Q -1.699e-01  2.808e-03  -60.505  < 2e-16 ***
## as.factor(weekday).C -3.883e-03  2.803e-03   -1.385    0.166    
## as.factor(weekday)^4 -2.054e-02  2.814e-03   -7.299  2.9e-13 ***
## as.factor(weekday)^5 -5.798e-02  2.807e-03  -20.654  < 2e-16 ***
## as.factor(weekday)^6 -2.507e-02  2.820e-03   -8.891  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df    F p-value    
## s(as.numeric(month))      7.997      8 2708  <2e-16 ***
## s(latitude.x,longitude.x) 8.999      9 6588  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 30/31
## R-sq.(adj) =  0.156   Deviance explained = 15.6%
## GCV = 0.86621  Scale est. = 0.86618   n = 772251
plot(h2s_model_c)

# Include converted angle and weekday
h2s_model_d <- gam(H2S ~ 
                     s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                     wd_sec + ws + downwind_ref + downwind_wrp +
                     I(1/MinDist^2) + dist_wrp + capacity +
                     Converted_Angle + 
                     s(latitude.x, longitude.x, bs='tp', k = 10) + 
                     monthly_oil_1km + monthly_gas_1km + active_1km
                     , 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     dist_wrp + capacity + Converted_Angle + s(latitude.x, longitude.x, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          -2.757e+00  5.816e-02  -47.406  < 2e-16 ***
## year2023             -1.356e-01  6.865e-03  -19.751  < 2e-16 ***
## as.factor(weekday).L  1.082e-01  2.989e-03   36.201  < 2e-16 ***
## as.factor(weekday).Q -1.838e-01  2.995e-03  -61.371  < 2e-16 ***
## as.factor(weekday).C -8.157e-04  2.981e-03   -0.274  0.78432    
## as.factor(weekday)^4 -1.475e-02  2.983e-03   -4.945 7.61e-07 ***
## as.factor(weekday)^5 -5.943e-02  2.975e-03  -19.979  < 2e-16 ***
## as.factor(weekday)^6 -2.875e-02  2.985e-03   -9.631  < 2e-16 ***
## wd_secQ2             -1.520e-01  3.935e-03  -38.615  < 2e-16 ***
## wd_secQ3             -1.310e-01  4.161e-03  -31.492  < 2e-16 ***
## wd_secQ4             -7.165e-02  3.677e-03  -19.487  < 2e-16 ***
## ws                   -7.086e-02  4.333e-04 -163.531  < 2e-16 ***
## downwind_ref          1.444e-01  4.136e-03   34.917  < 2e-16 ***
## downwind_wrp         -1.370e-02  4.575e-03   -2.993  0.00276 ** 
## I(1/MinDist^2)        4.732e-05  9.908e-07   47.760  < 2e-16 ***
## dist_wrp              5.576e-04  6.067e-06   91.915  < 2e-16 ***
## capacity              4.449e-03  4.555e-05   97.666  < 2e-16 ***
## Converted_Angle      -2.954e-03  1.232e-04  -23.966  < 2e-16 ***
## monthly_oil_1km       5.172e-05  2.488e-06   20.787  < 2e-16 ***
## monthly_gas_1km       2.546e-04  1.226e-05   20.767  < 2e-16 ***
## active_1km           -2.863e-02  1.076e-03  -26.604  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df      F p-value    
## s(as.numeric(month))      7.998  8.000  988.2  <2e-16 ***
## s(latitude.x,longitude.x) 8.003  8.004 5198.0  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 36/38
## R-sq.(adj) =  0.162   Deviance explained = 16.2%
## GCV = 0.90017  Scale est. = 0.90012   n = 711593
plot(h2s_model_d)

# Include elvation, EVI, 3D smooth, odor
h2s_model_e <- gam(H2S ~ 
                     s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                     wd_sec + ws + downwind_ref + downwind_wrp +
                     I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
                     Converted_Angle + elevation + EVI + 
                     s(latitude.x, longitude.x, bs='tp', k = 10) + 
                     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                     monthly_oil_1km + monthly_gas_1km + active_1km +
                     num_odor_complaints
                     , 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(latitude.x, longitude.x, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           3.614e+00  2.308e-01   15.658  < 2e-16 ***
## year2023              2.185e-02  1.754e-02    1.246    0.213    
## as.factor(weekday).L  1.076e-01  2.862e-03   37.580  < 2e-16 ***
## as.factor(weekday).Q -1.769e-01  2.869e-03  -61.675  < 2e-16 ***
## as.factor(weekday).C  4.129e-03  2.848e-03    1.450    0.147    
## as.factor(weekday)^4 -1.431e-02  2.848e-03   -5.025 5.03e-07 ***
## as.factor(weekday)^5 -6.237e-02  2.840e-03  -21.958  < 2e-16 ***
## as.factor(weekday)^6 -2.703e-02  2.855e-03   -9.470  < 2e-16 ***
## wd_secQ2             -1.355e-01  3.770e-03  -35.952  < 2e-16 ***
## wd_secQ3             -1.676e-01  4.006e-03  -41.831  < 2e-16 ***
## wd_secQ4             -9.811e-02  3.529e-03  -27.803  < 2e-16 ***
## ws                   -6.682e-02  4.167e-04 -160.365  < 2e-16 ***
## downwind_ref          1.107e-01  3.971e-03   27.886  < 2e-16 ***
## downwind_wrp         -4.663e-03  4.373e-03   -1.066    0.286    
## I(1/MinDist^2)       -5.018e-07  7.849e-06   -0.064    0.949    
## I(1/dist_wrp^2)       1.899e-06  3.608e-07    5.264 1.41e-07 ***
## capacity              1.108e-02  1.242e-03    8.921  < 2e-16 ***
## wrp_angle             3.887e-03  2.075e-04   18.732  < 2e-16 ***
## Converted_Angle      -2.337e-02  2.559e-03   -9.133  < 2e-16 ***
## elevation            -2.407e-01  1.314e-02  -18.328  < 2e-16 ***
## EVI                   3.454e+00  2.639e-01   13.088  < 2e-16 ***
## monthly_oil_1km       5.898e-05  4.933e-06   11.956  < 2e-16 ***
## monthly_gas_1km      -3.534e-05  2.793e-05   -1.265    0.206    
## active_1km           -1.808e-02  1.844e-03   -9.805  < 2e-16 ***
## num_odor_complaints   3.483e-02  1.936e-03   17.992  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.992  8.000 383.3
## s(latitude.x,longitude.x)                                1.678  1.678 126.8
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 81.434 81.435 860.7
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(latitude.x,longitude.x)                                <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 120/131
## R-sq.(adj) =  0.239   Deviance explained = 23.9%
## GCV = 0.81794  Scale est. = 0.81781   n = 711593
plot(h2s_model_e)

gc()
##             used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells   5673794  303.1    8396474   448.5    8396474   448.5
## Vcells 518362147 3954.8 1529350983 11668.1 1382492477 10547.6
# Model only the month of the event, October 2021
h2s_model_f <- gam(H2S ~ wd_sec + ws + downwind_ref + downwind_wrp +
                     I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
                     Converted_Angle + elevation + EVI + 
                     s(latitude.x, longitude.x, bs='tp', k = 10) + 
                     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                     monthly_oil_1km + monthly_gas_1km + active_1km +
                     num_odor_complaints
                     , 
                   data = full_data %>% filter(year == '2021', month == '10'))
summary(h2s_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(latitude.x, longitude.x, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.456e-07  3.981e-07   2.375 0.017549 *  
## wd_secQ2             3.217e+00  9.490e-01   3.390 0.000699 ***
## wd_secQ3             6.650e+00  9.427e-01   7.054 1.75e-12 ***
## wd_secQ4             8.045e+00  8.902e-01   9.038  < 2e-16 ***
## ws                  -2.347e+00  1.158e-01 -20.262  < 2e-16 ***
## downwind_ref         1.771e+00  8.597e-01   2.060 0.039407 *  
## downwind_wrp         1.307e+00  1.077e+00   1.214 0.224903    
## I(1/MinDist^2)      -3.506e-03  1.585e-03  -2.212 0.026960 *  
## I(1/dist_wrp^2)     -1.201e-03  8.099e-05 -14.835  < 2e-16 ***
## capacity             7.376e-01  1.505e-01   4.899 9.63e-07 ***
## wrp_angle            1.019e+00  1.659e-01   6.146 8.00e-10 ***
## Converted_Angle     -2.272e+00  2.881e-01  -7.885 3.19e-15 ***
## elevation           -2.755e+01  2.379e+00 -11.581  < 2e-16 ***
## EVI                  9.445e+02  1.053e+02   8.973  < 2e-16 ***
## monthly_oil_1km      1.626e-02  7.109e-03   2.287 0.022180 *  
## monthly_gas_1km      2.391e-03  1.057e-03   2.263 0.023612 *  
## active_1km           4.054e-05  1.846e-05   2.196 0.028071 *  
## num_odor_complaints  7.443e-01  6.676e-02  11.150  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df       F
## s(latitude.x,longitude.x)                                1.638  1.638   2.823
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 84.676 84.948 218.300
##                                                         p-value    
## s(latitude.x,longitude.x)                                 0.173    
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 106/116
## R-sq.(adj) =  0.263   Deviance explained = 26.4%
## GCV = 5458.6  Scale est. = 5451.7    n = 77510
plot(h2s_model_f)

gc()
##             used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells   5674648  303.1    8396474   448.5    8396474   448.5
## Vcells 521886125 3981.7 1529350983 11668.1 1382492477 10547.6
# Model data excluding the month of the event, October 2021
h2s_model_g <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + 
                     wd_sec + ws + downwind_ref + downwind_wrp +
                     I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
                     Converted_Angle + elevation + EVI + 
                     s(latitude.x, longitude.x, bs='tp', k = 10) + 
                     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                     monthly_oil_1km + monthly_gas_1km + active_1km +
                     num_odor_complaints
                     , 
                   data = full_data %>% filter(year != '2021', month != '10'))
summary(h2s_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     downwind_ref + downwind_wrp + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + wrp_angle + Converted_Angle + elevation + EVI + 
##     s(latitude.x, longitude.x, bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + num_odor_complaints
## 
## Parametric coefficients:
##                       Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          7.338e-01  1.222e-01    6.005 1.92e-09 ***
## year2022            -2.206e+00  1.614e-01  -13.669  < 2e-16 ***
## year2023            -2.316e+00  1.626e-01  -14.240  < 2e-16 ***
## wd_secQ2            -1.284e-01  2.891e-03  -44.408  < 2e-16 ***
## wd_secQ3            -2.125e-01  2.908e-03  -73.061  < 2e-16 ***
## wd_secQ4            -1.376e-01  2.742e-03  -50.176  < 2e-16 ***
## ws                  -5.836e-02  3.157e-04 -184.875  < 2e-16 ***
## downwind_ref         4.704e-02  2.601e-03   18.085  < 2e-16 ***
## downwind_wrp         3.566e-02  3.303e-03   10.795  < 2e-16 ***
## I(1/MinDist^2)      -1.111e-04  2.839e-06  -39.129  < 2e-16 ***
## I(1/dist_wrp^2)      2.113e-05  5.665e-07   37.307  < 2e-16 ***
## capacity             2.165e-02  2.506e-04   86.397  < 2e-16 ***
## wrp_angle           -9.541e-03  2.403e-04  -39.710  < 2e-16 ***
## Converted_Angle     -1.446e-02  3.410e-04  -42.401  < 2e-16 ***
## elevation            1.177e-01  3.917e-03   30.062  < 2e-16 ***
## EVI                 -5.429e+00  1.549e-01  -35.053  < 2e-16 ***
## monthly_oil_1km     -5.368e-05  2.183e-06  -24.587  < 2e-16 ***
## monthly_gas_1km     -1.204e-03  2.359e-05  -51.012  < 2e-16 ***
## active_1km          -3.450e-03  1.098e-03   -3.141  0.00168 ** 
## num_odor_complaints  4.586e-02  1.469e-03   31.218  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df    F
## s(as.numeric(month))                                     7.999  8.000 1188
## s(latitude.x,longitude.x)                                1.606  1.606 4394
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 72.495 72.496 2308
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(latitude.x,longitude.x)                                <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 106/126
## R-sq.(adj) =  0.273   Deviance explained = 27.3%
## GCV = 0.7076  Scale est. = 0.70754   n = 1028486
plot(h2s_model_g)

gc()
##             used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells   5676308  303.2    8396474   448.5    8396474   448.5
## Vcells 570433698 4352.1 1835301179 14002.3 1782965375 13603.0

Daily max H2S

# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
                        I(1/MinDist^2), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg + 
##     ws_avg + I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.062e+00  3.468e-01   8.829   <2e-16 ***
## year2023       -4.367e-01  2.197e-01  -1.988   0.0469 *  
## wd_avg          5.555e-04  1.031e-03   0.539   0.5903    
## ws_avg         -8.938e-02  6.342e-02  -1.409   0.1588    
## I(1/MinDist^2)  7.943e-07  8.452e-08   9.399   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F p-value    
## s(as.numeric(month)) 2.171      8 2.002   7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 12/13
## R-sq.(adj) =  0.00868   Deviance explained = 1.06%
## GCV = 21.458  Scale est. = 21.408    n = 2664
plot(h2s_dm_model_a)

# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery+s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Refinery + s(monitor_lat, monitor_lon, bs = "tp", 
##     k = 10)
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       9.085e+00  1.486e+01   0.612  0.54088    
## wd_avg                            2.922e-03  1.008e-03   2.900  0.00376 ** 
## ws_avg                           -3.126e-01  6.339e-02  -4.931  8.7e-07 ***
## I(1/MinDist^2)                   -1.612e-05  1.834e-03  -0.009  0.99299    
## RefineryMarathon (Carson)        -2.861e-01  1.840e+01  -0.016  0.98759    
## RefineryMarathon (Wilmington)    -3.058e+00  1.820e+01  -0.168  0.86659    
## RefineryPhillips 66 (Wilmington) -1.378e+01  1.702e+01  -0.809  0.41840    
## RefineryTorrance Refinery        -2.545e+00  1.470e+01  -0.173  0.86261    
## RefineryValero Refinery          -6.947e+00  1.842e+01  -0.377  0.70604    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F  p-value    
## s(as.numeric(month))       2.026  8.000 1.698 0.000225 ***
## s(monitor_lat,monitor_lon) 5.408  5.699 9.292 6.79e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 25/26
## R-sq.(adj) =  0.127   Deviance explained = 13.1%
## GCV = 18.972  Scale est. = 18.862    n = 2664
plot(h2s_dm_model_b)

# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Converted_Angle+
                        s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10)
## 
## Parametric coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.630e+00  8.823e-01   4.114    4e-05 ***
## wd_avg           2.807e-03  1.012e-03   2.775  0.00555 ** 
## ws_avg          -2.243e-01  6.210e-02  -3.612  0.00031 ***
## I(1/MinDist^2)  -8.554e-06  3.810e-05  -0.225  0.82237    
## Converted_Angle -3.393e-03  3.891e-03  -0.872  0.38337    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F  p-value    
## s(as.numeric(month))       2.103  8.000  2.015 5.44e-05 ***
## s(monitor_lat,monitor_lon) 7.070  7.918 41.050  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 21/22
## R-sq.(adj) =  0.116   Deviance explained = 12.1%
## GCV = 19.175  Scale est. = 19.08     n = 2664
plot(h2s_dm_model_c)

h2s_dm_model_d <- gam(H2S_daily_max ~ 
                         s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + dist_wrp + capacity +
                         Converted_Angle + 
                         s(monitor_lat, monitor_lon, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km
                      , 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + dist_wrp + 
##     capacity + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.172e+01  4.645e+00  -2.524 0.011678 *  
## year2023             -4.292e-01  4.437e-01  -0.967 0.333507    
## as.factor(weekday).L  4.648e-01  2.421e-01   1.920 0.054935 .  
## as.factor(weekday).Q -9.271e-01  2.424e-01  -3.825 0.000134 ***
## as.factor(weekday).C -1.117e-02  2.406e-01  -0.046 0.962981    
## as.factor(weekday)^4 -1.140e-01  2.401e-01  -0.475 0.635008    
## as.factor(weekday)^5 -3.636e-01  2.393e-01  -1.519 0.128806    
## as.factor(weekday)^6 -1.897e-01  2.397e-01  -0.791 0.428742    
## wd_avg                3.774e-03  1.102e-03   3.426 0.000624 ***
## ws_avg               -3.508e-01  6.988e-02  -5.020 5.54e-07 ***
## daily_downwind_ref    8.150e-01  4.117e-01   1.980 0.047827 *  
## I(1/MinDist^2)        1.228e-04  4.155e-04   0.296 0.767571    
## dist_wrp              2.170e-03  7.111e-04   3.051 0.002304 ** 
## capacity              9.117e-03  1.247e-02   0.731 0.464918    
## Converted_Angle       6.839e-03  9.438e-03   0.725 0.468756    
## monthly_oil_1km      -3.487e-05  1.292e-04  -0.270 0.787354    
## monthly_gas_1km      -2.724e-04  5.929e-04  -0.459 0.645948    
## active_1km            5.170e-03  4.278e-02   0.121 0.903816    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F p-value    
## s(as.numeric(month))       1.729  8.000  0.976 0.00459 ** 
## s(monitor_lat,monitor_lon) 7.783  7.975 13.054 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 34/35
## R-sq.(adj) =  0.137   Deviance explained = 14.6%
## GCV = 20.188  Scale est. = 19.967    n = 2428
plot(h2s_dm_model_d)

# Try to include as many variables as possible
h2s_dm_model_e <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                      data = daily_full %>% filter(day >= '2022-02-01'))

summary(h2s_dm_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               6.666e+00  1.815e+00   3.672 0.000246 ***
## year2023                 -4.106e-01  4.443e-01  -0.924 0.355507    
## as.character(weekday)Mon -6.843e-01  3.373e-01  -2.029 0.042598 *  
## as.character(weekday)Sat -7.572e-01  3.433e-01  -2.206 0.027510 *  
## as.character(weekday)Sun -1.192e+00  3.413e-01  -3.494 0.000485 ***
## as.character(weekday)Thu -3.588e-01  3.417e-01  -1.050 0.293756    
## as.character(weekday)Tue -1.363e-01  3.363e-01  -0.405 0.685185    
## as.character(weekday)Wed  2.723e-02  3.407e-01   0.080 0.936305    
## wd_avg                    3.625e-03  1.105e-03   3.279 0.001057 ** 
## ws_avg                   -3.414e-01  6.961e-02  -4.904 1.00e-06 ***
## daily_downwind_ref        7.746e-01  4.071e-01   1.903 0.057179 .  
## I(1/dist_wrp^2)          -1.882e-06  2.943e-07  -6.394 1.93e-10 ***
## monthly_oil_1km          -4.350e-05  1.272e-04  -0.342 0.732462    
## monthly_gas_1km          -2.489e-04  6.016e-04  -0.414 0.679122    
## active_1km                1.291e-02  4.168e-02   0.310 0.756751    
## daily_downwind_wrp        1.454e-01  4.311e-01   0.337 0.735989    
## elevation                -1.143e-01  6.418e-02  -1.781 0.075053 .  
## EVI                      -3.427e+00  7.699e-01  -4.452 8.90e-06 ***
## num_odor_complaints       1.615e-01  1.555e-01   1.038 0.299280    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(as.numeric(month))                                    1.085  8.000  0.413
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  8.159  8.756 14.556
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 3.089 77.000  0.090
##                                                         p-value    
## s(as.numeric(month))                                     0.0156 *  
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  0.0122 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 112/113
## R-sq.(adj) =  0.139   Deviance explained = 14.9%
## GCV = 20.176  Scale est. = 19.924    n = 2428
plot(h2s_dm_model_e)

e <- getViz(h2s_dm_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(e, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
# 
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_dm_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
h2s_dm_model_f <- gam(H2S_daily_max ~ as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                      data = daily_full %>% filter(year == '2021', month == '10'))

summary(h2s_dm_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ as.character(weekday) + wd_avg + ws_avg + daily_downwind_ref + 
##     I(1/dist_wrp^2) + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km + 
##     daily_downwind_wrp + elevation + EVI + num_odor_complaints
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               6.155e-06  1.228e-06   5.013 1.01e-06 ***
## as.character(weekday)Mon  1.349e+00  9.098e+01   0.015 0.988183    
## as.character(weekday)Sat -1.910e+01  7.183e+01  -0.266 0.790587    
## as.character(weekday)Sun -2.350e+01  7.184e+01  -0.327 0.743844    
## as.character(weekday)Thu  1.712e+01  7.449e+01   0.230 0.818387    
## as.character(weekday)Tue -5.373e+01  7.929e+01  -0.678 0.498642    
## as.character(weekday)Wed  5.286e+00  7.495e+01   0.071 0.943831    
## wd_avg                   -1.782e-01  3.223e-01  -0.553 0.580948    
## ws_avg                   -5.224e+00  3.576e+01  -0.146 0.883962    
## daily_downwind_ref       -1.557e+02  9.324e+01  -1.670 0.096196 .  
## I(1/dist_wrp^2)          -4.680e-04  1.258e-04  -3.720 0.000246 ***
## monthly_oil_1km           1.095e-01  2.184e-02   5.013 1.01e-06 ***
## monthly_gas_1km           1.626e-02  3.242e-03   5.013 1.01e-06 ***
## active_1km                2.831e-04  5.648e-05   5.013 1.01e-06 ***
## daily_downwind_wrp       -3.084e+01  1.189e+02  -0.259 0.795605    
## elevation                -9.357e+01  1.750e+01  -5.345 2.03e-07 ***
## EVI                       8.195e+01  2.217e+02   0.370 0.711942    
## num_odor_complaints       9.092e-01  1.619e+00   0.562 0.574839    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  8.793e+00  8.984 15.11
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 1.611e-07 80.000  0.00
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))   0.901    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 103/107
## R-sq.(adj) =  0.453   Deviance explained = 49.6%
## GCV = 1.2069e+05  Scale est. = 1.1065e+05  n = 274
plot(h2s_dm_model_f)

f <- getViz(h2s_dm_model_f)
plot(sm(f, 1)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_g <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                      data = daily_full %>% filter(year != '2021', month != '10'))

summary(h2s_dm_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               7.479e+00  2.085e+00   3.587  0.00034 ***
## year2022                 -2.703e-01  8.084e-01  -0.334  0.73814    
## year2023                 -6.711e-01  9.060e-01  -0.741  0.45896    
## as.character(weekday)Mon -7.542e-01  2.311e-01  -3.264  0.00111 ** 
## as.character(weekday)Sat -7.661e-01  2.345e-01  -3.267  0.00110 ** 
## as.character(weekday)Sun -1.090e+00  2.335e-01  -4.669 3.14e-06 ***
## as.character(weekday)Thu -5.043e-01  2.332e-01  -2.163  0.03063 *  
## as.character(weekday)Tue -4.176e-01  2.309e-01  -1.808  0.07062 .  
## as.character(weekday)Wed -2.204e-01  2.331e-01  -0.945  0.34452    
## wd_avg                    2.271e-03  8.069e-04   2.815  0.00491 ** 
## ws_avg                   -3.303e-01  4.852e-02  -6.807 1.17e-11 ***
## daily_downwind_ref        3.628e-01  2.234e-01   1.624  0.10438    
## I(1/dist_wrp^2)          -3.288e-06  4.143e-07  -7.936 2.79e-15 ***
## monthly_oil_1km          -8.192e-05  9.291e-05  -0.882  0.37799    
## monthly_gas_1km           3.606e-04  4.843e-04   0.745  0.45659    
## active_1km                1.483e-02  3.016e-02   0.492  0.62288    
## daily_downwind_wrp        2.541e-01  2.843e-01   0.894  0.37149    
## elevation                -1.432e-01  5.810e-02  -2.464  0.01378 *  
## EVI                      -3.290e+00  6.402e-01  -5.139 2.92e-07 ***
## num_odor_complaints       2.533e-01  1.041e-01   2.432  0.01505 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     1.218  8.000  0.307
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.848  8.955 18.646
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 12.632 66.000  0.564
##                                                          p-value    
## s(as.numeric(month))                                      0.0385 *  
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 1.77e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 102/103
## R-sq.(adj) =  0.159   Deviance explained = 16.9%
## GCV = 13.758  Scale est. = 13.596    n = 3540
plot(h2s_dm_model_g)

g <- getViz(h2s_dm_model_g)
plot(sm(g, 2)) + l_fitRaster() + l_fitContour() + l_points()

Daily Average

# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(monitor_lon, monitor_lat, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(monitor_lon, monitor_lat, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -5.312e+00  6.250e-01  -8.499  < 2e-16 ***
## year2023             -1.285e-01  6.043e-02  -2.127  0.03355 *  
## as.factor(weekday).L  8.010e-02  2.546e-02   3.146  0.00168 ** 
## as.factor(weekday).Q -1.702e-01  2.547e-02  -6.682 2.93e-11 ***
## as.factor(weekday).C  2.449e-02  2.527e-02   0.969  0.33268    
## as.factor(weekday)^4  3.029e-04  2.521e-02   0.012  0.99042    
## as.factor(weekday)^5 -5.445e-02  2.514e-02  -2.166  0.03040 *  
## as.factor(weekday)^6 -5.067e-02  2.517e-02  -2.013  0.04421 *  
## wd_avg                8.357e-04  1.162e-04   7.192 8.49e-13 ***
## ws_avg               -1.196e-01  7.590e-03 -15.758  < 2e-16 ***
## daily_downwind_ref    2.886e-01  4.339e-02   6.651 3.59e-11 ***
## I(1/MinDist^2)        3.248e-04  3.115e-05  10.426  < 2e-16 ***
## I(1/dist_wrp^2)      -1.769e-05  2.306e-06  -7.670 2.49e-14 ***
## capacity              1.673e-02  1.210e-03  13.835  < 2e-16 ***
## Converted_Angle      -2.682e-03  1.048e-03  -2.558  0.01058 *  
## monthly_oil_1km       4.360e-05  1.992e-05   2.189  0.02872 *  
## monthly_gas_1km       1.888e-04  1.009e-04   1.870  0.06156 .  
## active_1km           -2.076e-02  8.581e-03  -2.419  0.01563 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F p-value    
## s(as.numeric(month))       7.485      8  9.614  <2e-16 ***
## s(monitor_lon,monitor_lat) 8.987      9 73.350  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =  0.396   Deviance explained = 40.3%
## GCV = 0.22287  Scale est. = 0.21988   n = 2428
plot(h2s_da_model_a)

a <- getViz(h2s_da_model_a)
plot(sm(a, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Look at daily average
h2s_da_model_b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.724e+00  5.238e-01  -7.109 1.54e-12 ***
## year2023             -1.286e-01  6.043e-02  -2.128  0.03346 *  
## as.factor(weekday).L  8.010e-02  2.546e-02   3.146  0.00168 ** 
## as.factor(weekday).Q -1.702e-01  2.547e-02  -6.681 2.94e-11 ***
## as.factor(weekday).C  2.449e-02  2.527e-02   0.969  0.33269    
## as.factor(weekday)^4  2.901e-04  2.521e-02   0.012  0.99082    
## as.factor(weekday)^5 -5.444e-02  2.514e-02  -2.166  0.03043 *  
## as.factor(weekday)^6 -5.066e-02  2.517e-02  -2.013  0.04426 *  
## wd_avg                8.356e-04  1.162e-04   7.192 8.52e-13 ***
## ws_avg               -1.195e-01  7.589e-03 -15.752  < 2e-16 ***
## daily_downwind_ref    2.886e-01  4.339e-02   6.652 3.58e-11 ***
## I(1/MinDist^2)        1.481e-04  1.546e-05   9.582  < 2e-16 ***
## I(1/dist_wrp^2)      -1.109e-05  1.108e-06 -10.016  < 2e-16 ***
## capacity              1.276e-02  9.036e-04  14.122  < 2e-16 ***
## Converted_Angle      -2.550e-03  1.030e-03  -2.476  0.01335 *  
## monthly_oil_1km       4.361e-05  1.992e-05   2.189  0.02870 *  
## monthly_gas_1km       1.888e-04  1.009e-04   1.871  0.06145 .  
## active_1km           -2.077e-02  8.580e-03  -2.420  0.01559 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df      F p-value    
## s(as.numeric(month))                   7.484      8  9.618  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.980      9 73.203  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =  0.396   Deviance explained = 40.3%
## GCV = 0.22286  Scale est. = 0.21988   n = 2428
plot(h2s_da_model_b)

b <- getViz(h2s_da_model_b)
plot(sm(b, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_c <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           9.103e-01  2.439e-01   3.733 0.000194 ***
## year2023             -1.563e-01  6.337e-02  -2.467 0.013688 *  
## as.factor(weekday).L  8.209e-02  2.670e-02   3.075 0.002128 ** 
## as.factor(weekday).Q -1.670e-01  2.670e-02  -6.253 4.74e-10 ***
## as.factor(weekday).C  2.391e-02  2.650e-02   0.902 0.366948    
## as.factor(weekday)^4 -5.662e-03  2.643e-02  -0.214 0.830409    
## as.factor(weekday)^5 -4.929e-02  2.635e-02  -1.870 0.061551 .  
## as.factor(weekday)^6 -4.562e-02  2.639e-02  -1.729 0.083972 .  
## wd_avg                7.589e-04  1.217e-04   6.237 5.26e-10 ***
## ws_avg               -9.445e-02  7.733e-03 -12.215  < 2e-16 ***
## daily_downwind_ref    2.650e-01  4.535e-02   5.843 5.84e-09 ***
## I(1/dist_wrp^2)       2.316e-07  5.103e-08   4.539 5.94e-06 ***
## monthly_oil_1km       4.501e-05  2.092e-05   2.152 0.031505 *  
## monthly_gas_1km       2.173e-04  1.059e-04   2.053 0.040221 *  
## active_1km           -2.346e-02  9.008e-03  -2.604 0.009273 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.501  8.000 10.11  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.905  8.997 76.62  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 31/32
## R-sq.(adj) =  0.336   Deviance explained = 34.4%
## GCV = 0.24477  Scale est. = 0.2417    n = 2428
plot(h2s_da_model_c)

c <- getViz(h2s_da_model_c)
plot(sm(c, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_d <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km + daily_downwind_wrp + 
##     elevation + EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.527e+00  2.618e-01   5.834 6.14e-09 ***
## year2023                 -1.295e-01  6.042e-02  -2.143  0.03219 *  
## as.character(weekday)Mon -8.859e-02  3.551e-02  -2.495  0.01267 *  
## as.character(weekday)Sat -9.877e-02  3.608e-02  -2.737  0.00624 ** 
## as.character(weekday)Sun -1.985e-01  3.588e-02  -5.532 3.51e-08 ***
## as.character(weekday)Thu -4.873e-02  3.591e-02  -1.357  0.17490    
## as.character(weekday)Tue  1.210e-03  3.539e-02   0.034  0.97273    
## as.character(weekday)Wed  5.394e-02  3.584e-02   1.505  0.13248    
## wd_avg                    8.351e-04  1.162e-04   7.188 8.74e-13 ***
## ws_avg                   -1.200e-01  7.593e-03 -15.804  < 2e-16 ***
## daily_downwind_ref        2.897e-01  4.334e-02   6.684 2.89e-11 ***
## I(1/dist_wrp^2)          -2.132e-07  3.781e-08  -5.640 1.90e-08 ***
## monthly_oil_1km           4.348e-05  1.991e-05   2.184  0.02907 *  
## monthly_gas_1km           1.905e-04  1.009e-04   1.888  0.05920 .  
## active_1km               -2.088e-02  8.576e-03  -2.434  0.01499 *  
## daily_downwind_wrp        5.124e-02  4.532e-02   1.131  0.25835    
## elevation                -1.933e-02  7.594e-03  -2.545  0.01099 *  
## EVI                      -1.223e+00  8.124e-02 -15.055  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df      F p-value    
## s(as.numeric(month))                   7.479  8.000  9.638  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.912  8.997 47.632  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 34/35
## R-sq.(adj) =  0.396   Deviance explained = 40.4%
## GCV = 0.22293  Scale est. = 0.21987   n = 2428
plot(h2s_da_model_d)

d <- getViz(h2s_da_model_d)
plot(sm(d, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_e <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.024e+00  5.182e-01   3.906 9.65e-05 ***
## year2023                  1.875e-01  1.458e-01   1.286  0.19860    
## as.character(weekday)Mon -8.906e-02  2.900e-02  -3.071  0.00216 ** 
## as.character(weekday)Sat -9.560e-02  2.943e-02  -3.248  0.00118 ** 
## as.character(weekday)Sun -1.987e-01  2.930e-02  -6.781 1.51e-11 ***
## as.character(weekday)Thu -4.744e-02  2.934e-02  -1.617  0.10600    
## as.character(weekday)Tue  1.461e-03  2.895e-02   0.050  0.95974    
## as.character(weekday)Wed  4.602e-02  2.926e-02   1.573  0.11589    
## wd_avg                    6.597e-04  9.982e-05   6.609 4.79e-11 ***
## ws_avg                   -1.106e-01  6.455e-03 -17.141  < 2e-16 ***
## daily_downwind_ref        2.198e-01  3.605e-02   6.097 1.26e-09 ***
## I(1/dist_wrp^2)           4.334e-08  1.194e-07   0.363  0.71674    
## monthly_oil_1km           4.255e-05  3.307e-05   1.287  0.19831    
## monthly_gas_1km           6.281e-05  1.981e-04   0.317  0.75119    
## active_1km               -9.381e-03  1.272e-02  -0.737  0.46107    
## daily_downwind_wrp        5.237e-02  3.768e-02   1.390  0.16472    
## elevation                -6.505e-02  1.038e-02  -6.267 4.38e-10 ***
## EVI                      -1.618e+00  1.281e-01 -12.632  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.524      8  4.006
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 21.163
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.659     77 16.637
##                                                          p-value    
## s(as.numeric(month))                                    4.59e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/112
## R-sq.(adj) =  0.599   Deviance explained = 61.7%
## GCV = 0.15275  Scale est. = 0.14588   n = 2428
plot(h2s_da_model_e)

e <- getViz(h2s_da_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
# 
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
h2s_da_model_f <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.039e+00  5.177e-01   3.940 8.40e-05 ***
## year2023                  1.934e-01  1.457e-01   1.328  0.18446    
## as.character(weekday)Mon -8.896e-02  2.898e-02  -3.070  0.00217 ** 
## as.character(weekday)Sat -9.447e-02  2.941e-02  -3.212  0.00134 ** 
## as.character(weekday)Sun -1.958e-01  2.930e-02  -6.680 2.98e-11 ***
## as.character(weekday)Thu -4.941e-02  2.933e-02  -1.685  0.09216 .  
## as.character(weekday)Tue  1.885e-03  2.892e-02   0.065  0.94803    
## as.character(weekday)Wed  4.586e-02  2.923e-02   1.569  0.11684    
## wd_avg                    6.445e-04  9.999e-05   6.446 1.40e-10 ***
## ws_avg                   -1.101e-01  6.455e-03 -17.053  < 2e-16 ***
## daily_downwind_ref        2.206e-01  3.603e-02   6.124 1.07e-09 ***
## I(1/dist_wrp^2)           4.611e-08  1.193e-07   0.386  0.69926    
## monthly_oil_1km           3.829e-05  3.308e-05   1.158  0.24706    
## monthly_gas_1km           6.831e-05  1.979e-04   0.345  0.72996    
## active_1km               -8.711e-03  1.271e-02  -0.685  0.49333    
## daily_downwind_wrp        5.350e-02  3.765e-02   1.421  0.15553    
## elevation                -6.481e-02  1.037e-02  -6.248 4.93e-10 ***
## EVI                      -1.610e+00  1.280e-01 -12.578  < 2e-16 ***
## num_odor_complaints       2.989e-02  1.375e-02   2.174  0.02980 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.517      8  4.042
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 21.194
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.661     77 16.635
##                                                          p-value    
## s(as.numeric(month))                                    4.01e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 112/113
## R-sq.(adj) =    0.6   Deviance explained = 61.8%
## GCV = 0.15257  Scale est. = 0.14565   n = 2428
plot(h2s_da_model_f)

f <- getViz(h2s_da_model_f)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
# 
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
h2s_da_model_g <- gam(H2S_daily_avg~ as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                      data = daily_full %>% filter(year == '2021', month == '10'))
summary(h2s_da_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ as.character(weekday) + wd_avg + ws_avg + daily_downwind_ref + 
##     I(1/dist_wrp^2) + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km + 
##     daily_downwind_wrp + elevation + EVI + num_odor_complaints
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               6.644e-07  1.379e-07   4.816 2.53e-06 ***
## as.character(weekday)Mon -7.118e+00  1.025e+01  -0.695 0.487976    
## as.character(weekday)Sat -6.445e+00  8.092e+00  -0.796 0.426498    
## as.character(weekday)Sun -8.848e+00  8.094e+00  -1.093 0.275368    
## as.character(weekday)Thu  8.558e-02  8.391e+00   0.010 0.991871    
## as.character(weekday)Tue -1.298e+01  8.933e+00  -1.453 0.147520    
## as.character(weekday)Wed -3.811e+00  8.444e+00  -0.451 0.652146    
## wd_avg                   -1.884e-02  3.631e-02  -0.519 0.604363    
## ws_avg                    3.440e-01  4.027e+00   0.085 0.931997    
## daily_downwind_ref       -1.605e+01  1.050e+01  -1.528 0.127716    
## I(1/dist_wrp^2)          -5.240e-05  1.408e-05  -3.720 0.000245 ***
## monthly_oil_1km           1.182e-02  2.454e-03   4.816 2.53e-06 ***
## monthly_gas_1km           1.755e-03  3.643e-04   4.816 2.53e-06 ***
## active_1km                3.056e-05  6.345e-06   4.816 2.53e-06 ***
## daily_downwind_wrp       -1.092e+01  1.340e+01  -0.815 0.415823    
## elevation                -1.008e+01  1.966e+00  -5.129 5.83e-07 ***
## EVI                       1.128e+01  2.497e+01   0.452 0.651847    
## num_odor_complaints       2.407e-01  1.823e-01   1.321 0.187853    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                              edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  8.78e+00  8.982 12.66
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 1.18e-07 80.000  0.00
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))   0.902    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 103/107
## R-sq.(adj) =  0.428   Deviance explained = 47.4%
## GCV = 1531.7  Scale est. = 1404.4    n = 274
plot(h2s_da_model_g)

g <- getViz(h2s_da_model_g)
plot(sm(g, 1)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_h <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                      data = daily_full %>% filter(year != '2021', month != '10'))
summary(h2s_da_model_h)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               6.242e+00  1.108e+00   5.634 1.90e-08 ***
## year2022                 -1.400e+00  1.329e+00  -1.053 0.292219    
## year2023                 -1.425e+00  1.342e+00  -1.062 0.288518    
## as.character(weekday)Mon -1.141e-01  2.614e-02  -4.366 1.31e-05 ***
## as.character(weekday)Sat -1.281e-01  2.648e-02  -4.836 1.38e-06 ***
## as.character(weekday)Sun -1.978e-01  2.643e-02  -7.487 8.91e-14 ***
## as.character(weekday)Thu -5.216e-02  2.634e-02  -1.980 0.047761 *  
## as.character(weekday)Tue -6.735e-02  2.612e-02  -2.579 0.009953 ** 
## as.character(weekday)Wed -1.623e-02  2.634e-02  -0.616 0.537705    
## wd_avg                    3.136e-04  9.337e-05   3.359 0.000792 ***
## ws_avg                   -1.054e-01  5.891e-03 -17.893  < 2e-16 ***
## daily_downwind_ref        3.858e-02  2.571e-02   1.501 0.133566    
## I(1/dist_wrp^2)           3.957e-05  1.535e-05   2.578 0.009985 ** 
## monthly_oil_1km          -4.282e-05  1.808e-05  -2.368 0.017934 *  
## monthly_gas_1km          -1.210e-03  1.926e-04  -6.281 3.78e-10 ***
## active_1km               -2.150e-04  9.072e-03  -0.024 0.981094    
## daily_downwind_wrp        6.052e-02  3.248e-02   1.863 0.062480 .  
## elevation                -2.879e-02  7.638e-03  -3.769 0.000166 ***
## EVI                      -1.581e+00  1.146e-01 -13.795  < 2e-16 ***
## num_odor_complaints       3.456e-02  1.227e-02   2.816 0.004887 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.911      8 15.33
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 13.40
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 64.809     66 25.36
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 102/103
## R-sq.(adj) =   0.58   Deviance explained = 59.2%
## GCV = 0.17792  Scale est. = 0.17286   n = 3540
plot(h2s_da_model_h)

h <- getViz(h2s_da_model_h)
plot(sm(h, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_i <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints + october2021, 
                      data = daily_full %>% mutate(october2021 = if_else(year == '2021', month == '10', 1, 0)))
summary(h2s_da_model_i)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + october2021
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.859e+00  5.194e+00   0.358  0.72050    
## year2021                  4.910e+00  1.540e+00   3.188  0.00144 ** 
## year2022                  4.982e-01  1.754e+00   0.284  0.77644    
## year2023                  2.934e-01  2.027e+00   0.145  0.88490    
## as.character(weekday)Mon -3.154e-01  4.028e-01  -0.783  0.43361    
## as.character(weekday)Sat -2.694e-01  4.025e-01  -0.669  0.50325    
## as.character(weekday)Sun -3.207e-01  4.030e-01  -0.796  0.42616    
## as.character(weekday)Thu -3.159e-01  4.033e-01  -0.783  0.43347    
## as.character(weekday)Tue -6.353e-01  4.016e-01  -1.582  0.11367    
## as.character(weekday)Wed -3.640e-01  4.039e-01  -0.901  0.36752    
## wd_avg                    1.701e-04  1.428e-03   0.119  0.90516    
## ws_avg                   -1.220e-01  9.043e-02  -1.349  0.17748    
## daily_downwind_ref        1.895e-02  4.006e-01   0.047  0.96227    
## I(1/dist_wrp^2)          -2.131e-06  4.452e-06  -0.479  0.63221    
## monthly_oil_1km           8.383e-05  2.178e-04   0.385  0.70038    
## monthly_gas_1km           1.844e-04  8.925e-04   0.207  0.83636    
## active_1km                5.923e-02  7.735e-02   0.766  0.44381    
## daily_downwind_wrp        3.054e-01  4.854e-01   0.629  0.52922    
## elevation                -4.989e-01  9.598e-02  -5.198 2.07e-07 ***
## EVI                       3.145e-01  1.130e+00   0.278  0.78083    
## num_odor_complaints       9.312e-01  2.918e-02  31.917  < 2e-16 ***
## october2021               4.716e+00  6.616e-01   7.129 1.12e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    1.134e-06  8.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  6.686e+00  7.115 3.714
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 6.533e+01 80.000 2.512
##                                                          p-value    
## s(as.numeric(month))                                    0.674508    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  0.000462 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 118/119
## R-sq.(adj) =  0.253   Deviance explained = 26.3%
## GCV = 79.302  Scale est. = 78.213    n = 6771
plot(h2s_da_model_i)

i <- getViz(h2s_da_model_i)
plot(sm(i, 2)) + l_fitRaster() + l_fitContour() + l_points()

knitr::kable(tibble(Model = c('Since Feb 2022',
                              'October 2021 Only',
                              'Exclude October 2021',
                              'October 2021 Indicator'),
                    'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2))))
Model R-Sq
Since Feb 2022 0.60
October 2021 Only 0.43
Exclude October 2021 0.58
October 2021 Indicator 0.25

80/20 Cross Validation on Daily Average models

result <- tibble(Model = character(),
                 '80/20 Train R-Sq' = numeric(),
                 '80/20 Test R-Sq' = numeric())

predictors <- c('H2S_daily_avg', 'month', 'year', 'weekday', 'wd_avg', 'ws_avg', 'daily_downwind_ref', 'dist_wrp', 
                'mon_utm_x', 'mon_utm_y', 'day', 'monthly_oil_1km', 'monthly_gas_1km', 
                'active_1km', 'daily_downwind_wrp', 'elevation', 'EVI', 'num_odor_complaints') 

set.seed(313)

# model 1: since Feb 2022
train <- daily_full %>% 
  filter(day >= '2022-02-01') %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  sample_frac(0.8)

test <- anti_join(daily_full %>% 
                  filter(day >= '2022-02-01') %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints)`
h2s_da_model_f2 <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                        data = train)

predictions <- predict(h2s_da_model_f2, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2)$n - 1)/
  (summary(h2s_da_model_f2)$n - summary(h2s_da_model_f2)$np - 1)

result <- rbind(result, tibble(Model = 'Since Feb 2022',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f2)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))


# Model 2: October 2021 only
train <- daily_full %>% 
  filter(year == '2021', month == '10') %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  sample_frac(0.8)

test <- anti_join(daily_full %>% 
                  filter(year == '2021', month == '10') %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints)`
h2s_da_model_f3 <- gam(H2S_daily_avg~as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                        data = train)

predictions <- predict(h2s_da_model_f3, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3)$n - 1)/
  (summary(h2s_da_model_f3)$n - summary(h2s_da_model_f3)$np - 1)

result <- rbind(result, tibble(Model = 'October 2021 Only',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f3)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

# Model 3: Excluding October 2021
train <- daily_full %>% 
  filter(year != '2021', month != '10') %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  sample_frac(0.8)

test <- anti_join(daily_full %>% 
                  filter(year != '2021', month != '10') %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints)`
h2s_da_model_f4 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                        data = train)

predictions <- predict(h2s_da_model_f4, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4)$n - 1)/
  (summary(h2s_da_model_f4)$n - summary(h2s_da_model_f4)$np - 1)

result <- rbind(result, tibble(Model = 'Exclude October 2021',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f4)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

# Model 4: October 2021 Indicator
train <- daily_full %>% 
  select(all_of(predictors)) %>% 
  mutate(october2021 = if_else(year == '2021', month == '10', 1, 0)) %>% 
  filter(complete.cases(.)) %>% 
  sample_frac(0.8)

test <- anti_join(daily_full %>% 
                  select(all_of(predictors)) %>% 
                    mutate(october2021 = if_else(year == '2021', month == '10', 1, 0)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, october2021)`
h2s_da_model_f5 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         october2021, 
                        data = train)

predictions <- predict(h2s_da_model_f5, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5)$n - 1)/
  (summary(h2s_da_model_f5)$n - summary(h2s_da_model_f5)$np - 1)

result <- rbind(result, tibble(Model = 'October 2021 Indicator',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f5)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))
knitr::kable(tibble(Model = c('Since Feb 2022',
                              'October 2021 Only',
                              'Exclude October 2021',
                              'October 2021 Indicator'),
                    'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2))) %>% 
               left_join(result, join_by(Model)))
Model R-Sq 80/20 Train R-Sq 80/20 Test R-Sq
Since Feb 2022 0.60 0.5798024 0.6412991
October 2021 Only 0.43 0.3960353 -0.0016982
Exclude October 2021 0.58 0.5850159 0.5312950
October 2021 Indicator 0.25 0.2652137 0.5154652

Cross Validation on each monitor

result_cv <- tibble(Model = character(),
                 'CV Avg Train R-Sq' = numeric(),
                 'CV Avg Test R-Sq' = numeric())

# model 1: since Feb 2022
post2022feb <- daily_full %>% 
  filter(day >= '2022-02-01') %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(post2022feb$Monitor)

cv1_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- post2022feb %>%
    filter(Monitor != monitors[i])
  
  test <- post2022feb %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f2b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f2b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
    (summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
  
  cv1_result <- rbind(cv1_result, tibble(monitors[i],
                                         summary(h2s_da_model_f2b)$r.sq, 
                                         summary(h2s_da_model_f2b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Since Feb 2022',
                                           'CV Avg Train R-Sq' = mean(cv1_result$`summary(h2s_da_model_f2b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv1_result$adj_r2_test))))

# model 2: October 2021 only
october2021 <- daily_full %>% 
  filter(year == '2021', month == '10') %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(october2021$Monitor)

cv2_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- october2021 %>%
    filter(Monitor != monitors[i])
  
  test <- october2021 %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f2b <- gam(H2S_daily_avg~as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f2b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
    (summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
  
  cv2_result <- rbind(cv2_result, tibble(monitors[i],
                                         summary(h2s_da_model_f2b)$r.sq, 
                                         summary(h2s_da_model_f2b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'October 2021 Only',
                                           'CV Avg Train R-Sq' = mean(cv2_result$`summary(h2s_da_model_f2b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv2_result$adj_r2_test))))

# model 3: Exclude October 2021 
exclude_oct2021 <- daily_full %>% 
  filter(year != '2021', month != '10') %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(exclude_oct2021$Monitor)

cv3_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- exclude_oct2021 %>%
    filter(Monitor != monitors[i])
  
  test <- exclude_oct2021 %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f2b <- gam(H2S_daily_avg~as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f2b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
    (summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
  
  cv3_result <- rbind(cv3_result, tibble(monitors[i],
                                         summary(h2s_da_model_f2b)$r.sq, 
                                         summary(h2s_da_model_f2b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Exclude October 2021',
                                           'CV Avg Train R-Sq' = mean(cv3_result$`summary(h2s_da_model_f2b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv3_result$adj_r2_test))))

# model 4: October 2021 Indicator 
full_complete <- daily_full %>% 
  filter(year != '2021', month != '10') %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.)) %>%
  mutate(october2021 = if_else(year == '2021', month == '10', 1, 0))

monitors <- unique(full_complete$Monitor)

cv4_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- full_complete %>%
    filter(Monitor != monitors[i])
  
  test <- full_complete %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f2b <- gam(H2S_daily_avg~as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         october2021, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f2b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
    (summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
  
  cv4_result <- rbind(cv4_result, tibble(monitors[i],
                                         summary(h2s_da_model_f2b)$r.sq, 
                                         summary(h2s_da_model_f2b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'October 2021 Indicator',
                                           'CV Avg Train R-Sq' = mean(cv4_result$`summary(h2s_da_model_f2b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv4_result$adj_r2_test))))

cv1_result
cv2_result
cv3_result
cv4_result
knitr::kable(tibble(Model = c('Since Feb 2022',
                              'October 2021 Only',
                              'Exclude October 2021',
                              'October 2021 Indicator'),
                    'Train R-sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2))) %>% 
               left_join(result, join_by(Model)) %>%
               left_join(result_cv, join_by(Model)))
Model Train R-sq 80/20 Train R-Sq 80/20 Test R-Sq CV Avg Train R-Sq CV Avg Test R-Sq
Since Feb 2022 0.60 0.5798024 0.6412991 0.6057831 0.1821341
October 2021 Only 0.43 0.3960353 -0.0016982 0.7539210 -0.4691843
Exclude October 2021 0.58 0.5850159 0.5312950 0.5779813 0.0773571
October 2021 Indicator 0.25 0.2652137 0.5154652 0.5779813 0.0770660

Monthly

h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       2.299e+00  6.337e-01   3.628 0.000285 ***
## year2021                          7.513e+00  3.587e-01  20.948  < 2e-16 ***
## year2022                         -9.719e+00  3.990e-01 -24.359  < 2e-16 ***
## year2023                         -4.519e+00  5.031e-01  -8.983  < 2e-16 ***
## wd_secQ2                         -1.993e-01  3.750e-01  -0.531 0.595141    
## wd_secQ3                          3.380e+00  3.813e-01   8.865  < 2e-16 ***
## wd_secQ4                         -1.999e+00  3.604e-01  -5.546 2.93e-08 ***
## ws                                1.983e-01  4.050e-02   4.898 9.68e-07 ***
## I(1/MinDist^2)                    5.375e+05  3.251e+05   1.653 0.098241 .  
## RefineryMarathon (Carson)         4.189e+01  5.190e-01  80.710  < 2e-16 ***
## RefineryMarathon (Wilmington)     4.183e+00  6.036e-01   6.930 4.21e-12 ***
## RefineryPhillips 66 (Wilmington)  3.237e+00  5.332e-01   6.070 1.28e-09 ***
## RefineryTorrance Refinery        -3.331e+00  4.910e-01  -6.785 1.16e-11 ***
## RefineryValero Refinery           6.077e+00  5.903e-01  10.294  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.999      8 4076  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0271   Deviance explained = 2.71%
## GCV =  24024  Scale est. = 24024     n = 2056378
plot(h2s_ma_model_a)

h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude.x, longitude.x, bs='tp', k=10), data = full_data)
summary(h2s_ma_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery + s(latitude.x, longitude.x, 
##     bs = "tp", k = 10)
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       247.73043 1330.12474   0.186 0.852252    
## year2021                            4.46631    0.34810  12.830  < 2e-16 ***
## year2022                           -5.16925    0.39625 -13.045  < 2e-16 ***
## year2023                            7.01499    0.49759  14.098  < 2e-16 ***
## wd_secQ2                            0.75576    0.34603   2.184 0.028956 *  
## wd_secQ3                            0.04189    0.35248   0.119 0.905406    
## wd_secQ4                           -1.22491    0.33302  -3.678 0.000235 ***
## ws                                 -0.01374    0.03742  -0.367 0.713562    
## I(1/MinDist^2)                      1.80216    0.32232   5.591 2.26e-08 ***
## RefineryMarathon (Carson)        -768.08881 1478.49393  -0.520 0.603407    
## RefineryMarathon (Wilmington)      93.90759 1452.37996   0.065 0.948447    
## RefineryPhillips 66 (Wilmington)    0.48764 1513.24530   0.000 0.999743    
## RefineryTorrance Refinery        -241.61130 1400.50381  -0.173 0.863031    
## RefineryValero Refinery          -388.51087 1432.72236  -0.271 0.786261    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df     F p-value    
## s(as.numeric(month))      7.999      8  4850  <2e-16 ***
## s(latitude.x,longitude.x) 7.000      7 51324  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 30/31
## R-sq.(adj) =  0.172   Deviance explained = 17.2%
## GCV =  20447  Scale est. = 20446     n = 2056378
plot(h2s_ma_model_b)

Try XGBoost on Daily Average

daily_full <- daily_full %>%
  mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
         Monitor = str_replace_all(Monitor, ' ', '_'),
         weekday = as.character(weekday),
         daily_downwind_ref = as.integer(daily_downwind_ref),
         daily_downwind_wrp = as.integer(daily_downwind_wrp))

train <- daily_full[complete.cases(daily_full),] %>%
  filter(day >= '2022-02-01')

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
                             monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
                             mon_utm_x, mon_utm_y, county)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
                         max_depth = c(3, 4, 5),
                         eta = c(0.1, 0.3),
                         gamma = c(0.01, 0.001),
                         colsample_bytree = c(0.5, 1),
                         min_child_weight = 0,
                         subsample = c(0.5, 0.75, 1))

# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", 
                        number=10,
                        verboseIter=TRUE, 
                        search='grid')

fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
fit.xgb_da$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 478.6 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 38 
## niter: 200
## nfeatures : 38 
## xNames : monitor_lat monitor_lon MinDist Converted_Angle monthly_oil_1km monthly_gas_1km dist_wrp capacity active_1km wrp_angle ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 104     200         5 0.1  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top=10)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 43
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 10)

CV by leaving each monitor out once

post2022feb <- daily_full %>% 
  filter(day >= '2022-02-01')

monitor_names <- unique(post2022feb$Monitor)
# 
# for (i in 1:length(monitor_names)) {
#   train <- post2022feb %>%
#     filter(Monitor != monitor_names[i])
# 
#   train <- train[complete.cases(train),]
# 
#   train <- fastDummies::dummy_cols(train %>%
#                      select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
#                                monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
#                                mon_utm_x, mon_utm_y, county)) %>%
#                      mutate(MinDist = 1/(MinDist^2),
#                             dist_wrp = 1/(dist_wrp^2)),
#                      remove_selected_columns = TRUE)
# 
#   fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# 
#   saveRDS(fit.xgb_da, paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
# }
model_stats <- tibble(Monitor = character(), train_r2 = numeric(), test_r2 = numeric())

add_cols <- function(df, cols) {
  add <- cols[!cols %in% names(df)]
  if(length(add) !=0 ) df[add] <- NA
  return(df)
}

for (i in setdiff(1:length(monitor_names), c(9))) {

  fit.xgb_da <- readRDS(paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
  
  test <- post2022feb %>%
    filter(Monitor == monitor_names[i])

  test <- fastDummies::dummy_cols(test[complete.cases(test),] %>%
                     ungroup() %>%
                     select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
                           monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
                           mon_utm_x, mon_utm_y, county)) %>%
                       add_cols(c('year_2022', 'year_2023', 'month_01', 'month_02',
                                  'month_03', 'month_04', 'month_05', 'month_06', 
                                  'month_07', 'month_08', 'month_09', 'month_10',
                                  'month_11', 'month_12')) %>%
                     mutate(MinDist = 1/(MinDist^2),
                            dist_wrp = 1/(dist_wrp^2),
                            year_2022 = ifelse(is.na(year_2022), 0, year_2022),
                            year_2023 = ifelse(is.na(year_2023), 0, year_2023),
                            month_01 = ifelse(is.na(month_01), 0, month_01),
                            month_02 = ifelse(is.na(month_02), 0, month_02),
                            month_03 = ifelse(is.na(month_03), 0, month_03),
                            month_04 = ifelse(is.na(month_04), 0, month_04),
                            month_05 = ifelse(is.na(month_05), 0, month_05),
                            month_06 = ifelse(is.na(month_06), 0, month_06),
                            month_07 = ifelse(is.na(month_07), 0, month_07),
                            month_08 = ifelse(is.na(month_08), 0, month_08),
                            month_09 = ifelse(is.na(month_09), 0, month_09),
                            month_10 = ifelse(is.na(month_10), 0, month_10),
                            month_11 = ifelse(is.na(month_11), 0, month_11),
                            month_12 = ifelse(is.na(month_12), 0, month_12)),
                     remove_selected_columns = TRUE)
  
  train_r2 <- getTrainPerf(fit.xgb_da)$TrainRsquared
  
  predictions <- predict(fit.xgb_da$finalModel, 
                         newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
  
  test_r2 <- R2(test %>% pull(H2S_daily_avg), predictions)
  
  model_stats <- rbind(model_stats, tibble(Monitor = monitor_names[i], train_r2, test_r2))
}

model_stats %>% rbind(tibble(Monitor = 'Total Average', train_r2 = mean(model_stats$train_r2, na.rm = T),
                             test_r2 = mean(model_stats$test_r2, na.rm = T)))